Previously Processed Dataset

Data Preparation

raw_data = read_csv(here::here("Data", "compressed_okcupid.csv")) 
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   age = col_double(),
##   body_type = col_character(),
##   education = col_character(),
##   essay0 = col_character(),
##   essay9 = col_character(),
##   ethnicity = col_character(),
##   height = col_double(),
##   edu = col_character(),
##   fit = col_character(),
##   race_ethnicity = col_character(),
##   height_group = col_character(),
##   long_words = col_double(),
##   flesch = col_double()
## )
collapse_ethnicity =  Vectorize(function(x) {
  if (is.na(x)) {
    return(x)
  }
  if (str_detect(x, ",")) {
    if (str_detect(str_to_lower(x), "white")) {
      return("multi / white")
    }
    return("multi")
  }
  return(str_to_lower(x))
})
cleaned_data = raw_data %>% fix_nas(c("unknown")) %>% mutate(race_ethnicity = collapse_ethnicity(ethnicity), X1 = NULL) %>% mutate_if((function(x) length(unique(x)) < 20), factor)

Descriptive Statistics

data_descriptives = cleaned_data %>% skim_to_list() 
data_descriptives[[1]] %>% kable(caption = "Text Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Text Variables
variable missing complete n min max empty n_unique
education 0 18831 18831 10 33 0 32
essay0 0 18831 18831 1 16943 0 18814
essay9 0 18831 18831 1 10849 0 18252
ethnicity 0 18831 18831 5 103 0 151
data_descriptives[[2]] %>% kable(caption = "Categorical Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Categorical Variables
variable missing complete n n_unique top_counts ordered
body_type 0 18831 18831 12 ath: 5569, ave: 4993, fit: 4799, thi: 1182 FALSE
edu 2030 16801 18831 2 Mor: 13584, Hig: 3217, NA: 2030 FALSE
fit 52 18779 18831 2 fit: 10543, not: 8236, NA: 52 FALSE
height_group 0 18831 18831 2 not: 12210, sho: 6621, NA: 0 FALSE
race_ethnicity 0 18831 18831 11 whi: 11510, asi: 2018, mul: 1821, his: 855 FALSE
data_descriptives[[3]] %>% kable(caption = "Continuous Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Continuous Variables
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
age 0 18831 18831 32.02 9.09 18 26 30 36 69 ▃▇▅▂▁▁▁▁
flesch 0 18831 18831 7.28 4.78 -15.59 4.85 6.73 8.96 268 ▇▁▁▁▁▁▁▁
height 0 18831 18831 70.51 3.04 3 69 70 72 95 ▁▁▁▁▁▇▂▁
long_words 0 18831 18831 11.32 13.28 0 3 8 15 446 ▇▁▁▁▁▁▁▁

Visualizations

Continuous Variables

cleaned_data %>% select_if(is.numeric) %>% ggpairs(progress = FALSE) + theme(axis.text.x = element_text(angle = 20, hjust = 1))

Categorical Variables

cleaned_data %>% select_if(is.factor) %>% pivot_longer(dplyr::everything()) %>% table() %>% as_tibble() %>% ggplot(aes(area = n, fill = value, label = value)) + geom_treemap() + geom_treemap_text(color = "white", place = "centre", grow = TRUE) + facet_wrap(~ name) + theme(legend.position = "none")

cleaned_data %>% select_if(is.factor) %>% na.omit() %>% mutate_all(collapse_to_other, n_categories = 4) %>% ggpairs(progress = FALSE) + theme(axis.text.x = element_text(angle = 20, hjust = 1))

Full Dataset

Data Preparation

full_raw_data = read_csv(here::here("Data", "full_ok_cupid.csv")) 
get_first_word = Vectorize(function(x){
  if (is.na(x)) {
    return(x)
  }
  str_split(x, " ")[[1]][1]
})

can_char_be_factor = function(x, n = 20) {
  if (is.character(x)) {
    return(length(unique(x)) < n)
  }
  return(FALSE)
}

get_diet = Vectorize(function(x, importance = FALSE) {
  split_char = str_split(x, " ")[[1]]
  if (length(split_char) == 1) {
    if (importance) {
      return(NA)
    }
    return(x)
  }
  if (importance) {
    return(split_char[1])
  }
  split_char[2]
})

other_importance = Vectorize(function(x) {
  if (is.na(x)) {
    return(x)
  }
  if (str_detect(x, "and")) {
     and_split = str_split(x, "and")[[1]]
     return(str_trim(and_split[2]))
  }
  if (str_detect(x, "but")) {
     but_split = str_split(x, "but")[[1]]
     return(str_trim(but_split[2]))
  }
  return(NA)
})

get_pets = Vectorize(function(x, pet_type) {
  if (is.na(x)) {
    return(x)
  }
  if (str_detect(x, paste("has", pet_type))) {
    return(paste("has", pet_type))
  }
  if (str_detect(x, paste("dislikes", pet_type))) {
    return(paste("dislikes", pet_type))
  }
  if (str_detect(x, paste("likes", pet_type))) {
    return(paste("likes", pet_type))
  }
  return(NA)
})

get_kids = Vectorize(function(x, wants = FALSE) {
  if (is.na(x)) {
    return(x)
  }
  if (wants) {
    if (str_detect(x, "doesn't want more")) {
      return("doesn't want more kids")
    }
    if (str_detect(x, "doesn't want")) {
      return("doesn't want kids")
    }
    if (str_detect(x, "might want")) {
      return("might want kids")
    }
    if (str_detect(x, "wants more")) {
      return("wants more kids")
    }
        if (str_detect(x, "wants")) {
      return("wants kids")
    }
  } else {
    if (str_detect(x, "doesn't have")) {
      return("doesn't have kids")
    }
    if (str_detect(x, "has kids")) {
      return("has kids")
    }
    if (str_detect(x, "has a kid")) {
      return("has a kid")
    }
  }
  return(NA)
})
location_data = full_raw_data %>% select(location) %>% unique() %>% bind_cols(geocode(unlist(.)))
san_fran_loc = c(-122.4194155, 37.77493)
all_locations = location_data %>% select(-location) %>% as.matrix()
location_data$distance = geosphere::distGeo(san_fran_loc, all_locations) * 0.000621371 # calculates distance between SF and all other locations and converts from meters to mils
location_data$visiting = location_data$distance > 25 # users are designated as "visiting" if the specified location on their profile is a certain distance from SF
full_cleaned_data = full_raw_data %>% 
  mutate_if(is.character, str_replace_all, pattern = "&rsquo;", replacement = "'") %>% 
  mutate(religion_raw = religion, religion_import = other_importance(religion), religion = get_first_word(religion), 
         sign_raw = sign, sign_import = other_importance(sign), sign = get_first_word(sign), 
         diet_raw = diet, diet_import = get_diet(diet, TRUE), diet = get_diet(diet),
         dogs = get_pets(pets, "dogs"), cats = get_pets(pets, "cats"), pets_raw = pets, pets = NULL,
         kids = get_kids(offspring), kids_import = get_kids(offspring, TRUE), kids_raw = offspring, offspring = NULL,
         speaks_en = str_detect(speaks, "english"), multi_ling = str_detect(speaks, ","), 
         ethnicity_raw = ethnicity, ethnicity = collapse_ethnicity(ethnicity),
         last_online_raw = last_online, last_online = ymd_hm(last_online), 
         year_last_online = year(last_online), month_last_online = month(last_online), day_last_online = day(last_online),
         time_since_online = as.period(max(last_online) - last_online), days_since_online = period_to_seconds(time_since_online) %>%  (function(x) x / 86400)) %>% 
  mutate_if(can_char_be_factor, factor) %>% left_join(location_data)

saveRDS(full_cleaned_data, here::here("Data", "full_ok_cupid_cleaned.rds"))
write_csv(full_cleaned_data, here::here("Data", "full_ok_cupid_cleaned.csv"))

Descriptive Statistics

full_data_descriptives = full_cleaned_data %>% skim_to_list() 
## Warning: No summary functions for vectors of class: Period.
## Coercing to character
full_data_descriptives[[1]] %>% kable(caption = "Text Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Text Variables
variable missing complete n min max empty n_unique
education 6628 53318 59946 10 33 0 32
essay0 5485 54461 59946 1 48854 0 54351
essay1 7571 52375 59946 1 7955 0 51517
essay2 9638 50308 59946 1 6129 0 48635
essay3 11476 48470 59946 1 4374 0 43533
essay4 10537 49409 59946 1 44469 0 49260
essay5 10847 49099 59946 1 30446 0 48964
essay6 13771 46175 59946 1 11385 0 43603
essay7 12450 47496 59946 1 3722 0 45555
essay8 19214 40732 59946 1 13304 0 39325
essay9 12602 47344 59946 1 11444 0 45444
ethnicity_raw 5680 54266 59946 5 103 0 217
job 8198 51748 59946 5 33 0 21
last_online_raw 0 59946 59946 16 16 0 30123
location 0 59946 59946 12 35 0 199
religion_raw 20226 39720 59946 5 42 0 45
sign_raw 11056 48890 59946 3 39 0 48
speaks 50 59896 59946 7 107 0 7647
time_since_online 0 59946 59946 2 15 0 30123
full_data_descriptives[[2]] %>% kable(caption = "Categorical Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Categorical Variables
variable missing complete n n_unique top_counts ordered
body_type 5296 54650 59946 12 ave: 14652, fit: 12711, ath: 11819, NA: 5296 FALSE
cats 31323 28623 59946 3 NA: 31323, lik: 18450, has: 7274, dis: 2899 FALSE
diet 24395 35551 59946 6 any: 27881, NA: 24395, veg: 4986, oth: 1790 FALSE
diet_import 31734 28212 59946 2 NA: 31734, mos: 21508, str: 6704 FALSE
diet_raw 24395 35551 59946 18 NA: 24395, mos: 16585, any: 6183, str: 5113 FALSE
dogs 22512 37434 59946 3 lik: 28380, NA: 22512, has: 8493, dis: 561 FALSE
drinks 2985 56961 59946 6 soc: 41780, rar: 5957, oft: 5164, not: 3267 FALSE
drugs 14080 45866 59946 3 nev: 37724, NA: 14080, som: 7732, oft: 410 FALSE
ethnicity 5680 54266 59946 11 whi: 32831, asi: 6134, NA: 5680, mul: 5051 FALSE
kids 38895 21051 59946 3 NA: 38895, doe: 16132, has: 2461, has: 2458 FALSE
kids_import 46885 13061 59946 5 NA: 46885, mig: 4403, doe: 4059, wan: 3790 FALSE
kids_raw 35561 24385 59946 15 NA: 35561, doe: 7560, doe: 3875, doe: 3565 FALSE
orientation 0 59946 59946 3 str: 51606, gay: 5573, bis: 2767, NA: 0 FALSE
pets_raw 19921 40025 59946 15 NA: 19921, lik: 14814, lik: 7224, lik: 4313 FALSE
religion 20226 39720 59946 9 NA: 20226, agn: 8812, oth: 7743, ath: 6985 FALSE
religion_import 32007 27939 59946 4 NA: 32007, not: 12212, lau: 8995, som: 4516 FALSE
sex 0 59946 59946 2 m: 35829, f: 24117, NA: 0 FALSE
sign 11056 48890 59946 12 NA: 11056, leo: 4374, gem: 4310, lib: 4207 FALSE
sign_import 23180 36766 59946 3 NA: 23180, it’: 19333, it : 16758, it : 675 FALSE
smokes 5512 54434 59946 5 no: 43896, NA: 5512, som: 3787, whe: 3040 FALSE
status 0 59946 59946 5 sin: 55697, see: 2064, ava: 1865, mar: 310 FALSE
full_data_descriptives[[4]] %>% kable(caption = "Dummy Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Dummy Variables
variable missing complete n mean count
multi_ling 50 59896 59946 0.51 TRU: 30824, FAL: 29072, NA: 50
speaks_en 50 59896 59946 1 TRU: 59896, NA: 50
visiting 0 59946 59946 0.043 FAL: 57390, TRU: 2556, NA: 0
full_data_descriptives[[5]] %>% kable(caption = "Continuous Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Continuous Variables
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
age 0 59946 59946 32.34 9.45 18 26 30 37 110 ▇▆▂▁▁▁▁▁
days_since_online 0 59946 59946 40.09 77.28 0 1.32 3.77 32.51 370.3 ▇▁▁▁▁▁▁▁
distance 0 59946 59946 10.47 104.38 3.4e-05 3.4e-05 3.4e-05 10.42 7642.77 ▇▁▁▁▁▁▁▁
height 3 59943 59946 68.3 3.99 1 66 68 71 95 ▁▁▁▁▁▇▂▁
income 0 59946 59946 20033.22 97346.19 -1 -1 -1 -1 1e+06 ▇▁▁▁▁▁▁▁
lat 0 59946 59946 37.77 0.33 12.24 37.77 37.77 37.8 55.95 ▁▁▁▁▇▁▁▁
lon 0 59946 59946 -122.28 2.2 -157.86 -122.42 -122.42 -122.27 109.2 ▁▇▁▁▁▁▁▁
month_last_online 0 59946 59946 5.89 1.65 1 6 6 6 12 ▁▁▁▇▁▁▁▁
year_last_online 0 59946 59946 2011.92 0.27 2011 2012 2012 2012 2012 ▁▁▁▁▁▁▁▇
full_data_descriptives[[6]] %>% kable(caption = "Date-time Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Date-time Variables
variable missing complete n min max median n_unique
last_online 0 59946 59946 2011-06-27 2012-07-01 2012-06-27 30123

Visualizations

Continuous Variables

full_cleaned_data %>% select_if(is.numeric) %>% select(-time_since_online) %>% ggpairs(progress = FALSE) + theme(axis.text.x = element_text(angle = 20, hjust = 1))

Categorical Variables

is_categorical = function(x) {
  is.factor(x) | is.logical(x)
}

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% pivot_longer(dplyr::everything()) %>% table() %>% as_tibble() %>% ggplot(aes(area = n, fill = value, label = value)) + geom_treemap() + geom_treemap_text(color = "white", place = "centre", grow = TRUE) + facet_wrap(~ name) + theme(legend.position = "none")

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% mutate_all(collapse_to_other, n_categories = 4) %>% pivot_longer(dplyr::everything()) %>% na.omit() %>% ggplot(aes(x = value)) + geom_bar() + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 20, hjust = 1))

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% na.omit() %>% pivot_longer(-sex) %>% ggplot(aes(fill = sex, x = value)) + geom_bar(position = "fill") + scale_y_continuous(labels = scales::percent)  + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 30, hjust = 1))

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% na.omit() %>% pivot_longer(-orientation) %>% ggplot(aes(fill = orientation, x = value)) + geom_bar(position = "fill") + scale_y_continuous(labels = scales::percent)  + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 30, hjust = 1))

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% mutate(ethnicity = collapse_to_other(ethnicity, 5))  %>% na.omit() %>% pivot_longer(-ethnicity) %>% ggplot(aes(fill = ethnicity, x = value)) + geom_bar(position = "fill") + scale_y_continuous(labels = scales::percent)  + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 30, hjust = 1))